Software of the Month Club 2000 October

home *** CD-ROM | disk | FTP | other *** search

/ Software of the Month Club 2000 October / Software of the Month - Ultimate Collection Shareware 277.iso / pc / PROGRAMS / UTILITY / WINLINUX / DATA1.CAB / programs_-_kernel_source / KERNEL / SCHED.C < prev next >

Wrap

C/C++ Source or Header | 1999-09-17 | 48KB | 1,955 lines

/* * linux/kernel/sched.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar * 1999-03-10 Improved NTP compatibility by Ulrich Windl */ /* * 'sched.c' is the main kernel file. It contains scheduling primitives * (sleep_on, wakeup, schedule etc) as well as a number of simple system * call functions (type getpid()), which just extract a field from * current-task */ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/fdreg.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/smp_lock.h> #include <linux/init.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> #include <asm/semaphore-helper.h> #include <linux/timex.h> /* * kernel variables */ unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ /* The current time */ volatile struct timeval xtime __attribute__ ((aligned (16))); /* Don't completely fail for HZ > 500. */ int tickadj = 500/HZ ? : 1; /* microsecs */ DECLARE_TASK_QUEUE(tq_timer); DECLARE_TASK_QUEUE(tq_immediate); DECLARE_TASK_QUEUE(tq_scheduler); /* * phase-lock loop variables */ /* TIME_ERROR prevents overwriting the CMOS clock */ int time_state = TIME_OK; /* clock synchronization status */ int time_status = STA_UNSYNC; /* clock status bits */ long time_offset = 0; /* time adjustment (us) */ long time_constant = 2; /* pll time constant */ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_phase = 0; /* phase offset (scaled us) */ long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; /* frequency offset (scaled ppm) */ long time_adj = 0; /* tick adjust (scaled 1 / HZ) */ long time_reftime = 0; /* time at last adjustment (s) */ long time_adjust = 0; long time_adjust_step = 0; unsigned long event = 0; extern int do_setitimer(int, struct itimerval *, struct itimerval *); unsigned int * prof_buffer = NULL; unsigned long prof_len = 0; unsigned long prof_shift = 0; extern void mem_use(void); unsigned long volatile jiffies=0; /* * Init task must be ok at boot for the ix86 as we will check its signals * via the SMP irq return path. */ struct task_struct * task[NR_TASKS] = {&init_task, }; struct kernel_stat kstat = { 0 }; void scheduling_functions_start_here(void) { } #ifdef __SMP__ static void reschedule_idle_slow(struct task_struct * p) { /* * (see reschedule_idle() for an explanation first ...) * * Pass #2 * * We try to find another (idle) CPU for this woken-up process. * * On SMP, we mostly try to see if the CPU the task used * to run on is idle.. but we will use another idle CPU too, * at this point we already know that this CPU is not * willing to reschedule in the near future. * * An idle CPU is definitely wasted, especially if this CPU is * running long-timeslice processes. The following algorithm is * pretty good at finding the best idle CPU to send this process * to. * * [We can try to preempt low-priority processes on other CPUs in * 2.3. Also we can try to use the avg_slice value to predict * 'likely reschedule' events even on other CPUs.] */ int best_cpu = p->processor, this_cpu = smp_processor_id(); struct task_struct **idle = task, *tsk, *target_tsk; int i = smp_num_cpus; target_tsk = NULL; do { tsk = *idle; idle++; if (tsk->has_cpu) { if (tsk->processor == this_cpu) continue; target_tsk = tsk; if (tsk->processor == best_cpu) { /* * bingo, we couldnt get a better * CPU, activate it. */ goto send; /* this one helps GCC ... */ } } } while (--i > 0); /* * found any idle CPU? */ if (target_tsk) { send: target_tsk->need_resched = 1; smp_send_reschedule(target_tsk->processor); return; } } #endif /* __SMP__ */ /* * If there is a dependency between p1 and p2, * don't be too eager to go into the slow schedule. * In particular, if p1 and p2 both want the kernel * lock, there is no point in trying to make them * extremely parallel.. * * (No lock - lock_depth < 0) */ #define related(p1,p2) ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0) static inline void reschedule_idle(struct task_struct * p) { if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) { current->need_resched = 1; return; } #ifdef __SMP__ /* * ("wakeup()" should not be called before we've initialized * SMP completely. * Basically a not-yet initialized SMP subsystem can be * considered as a not-yet working scheduler, simply dont use * it before it's up and running ...) * * SMP rescheduling is done in 2 passes: * - pass #1: faster: 'quick decisions' * - pass #2: slower: 'lets try and find another CPU' */ /* * Pass #1 * * There are two metrics here: * * first, a 'cutoff' interval, currently 0-200 usecs on * x86 CPUs, depending on the size of the 'SMP-local cache'. * If the current process has longer average timeslices than * this, then we utilize the idle CPU. * * second, if the wakeup comes from a process context, * then the two processes are 'related'. (they form a * 'gang') * * An idle CPU is almost always a bad thing, thus we skip * the idle-CPU utilization only if both these conditions * are true. (ie. a 'process-gang' rescheduling with rather * high frequency should stay on the same CPU). * * [We can switch to something more finegrained in 2.3.] */ if ((current->avg_slice < cacheflush_time) && related(current, p)) return; reschedule_idle_slow(p); #endif /* __SMP__ */ } /* * Careful! * * This has to add the process to the _beginning_ of the * run-queue, not the end. See the comment about "This is * subtle" in the scheduler proper.. */ static inline void add_to_runqueue(struct task_struct * p) { struct task_struct *next = init_task.next_run; p->prev_run = &init_task; init_task.next_run = p; p->next_run = next; next->prev_run = p; nr_running++; } static inline void del_from_runqueue(struct task_struct * p) { struct task_struct *next = p->next_run; struct task_struct *prev = p->prev_run; nr_running--; next->prev_run = prev; prev->next_run = next; p->next_run = NULL; p->prev_run = NULL; } static inline void move_last_runqueue(struct task_struct * p) { struct task_struct *next = p->next_run; struct task_struct *prev = p->prev_run; /* remove from list */ next->prev_run = prev; prev->next_run = next; /* add back to list */ p->next_run = &init_task; prev = init_task.prev_run; init_task.prev_run = p; p->prev_run = prev; prev->next_run = p; } static inline void move_first_runqueue(struct task_struct * p) { struct task_struct *next = p->next_run; struct task_struct *prev = p->prev_run; /* remove from list */ next->prev_run = prev; prev->next_run = next; /* add back to list */ p->prev_run = &init_task; next = init_task.next_run; init_task.next_run = p; p->next_run = next; next->prev_run = p; } /* * The tasklist_lock protects the linked list of processes. * * The scheduler lock is protecting against multiple entry * into the scheduling code, and doesn't need to worry * about interrupts (because interrupts cannot call the * scheduler). * * The run-queue lock locks the parts that actually access * and change the run-queues, and have to be interrupt-safe. */ spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED; /* should be acquired first */ spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */ rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */ /* * Wake up a process. Put it on the run-queue if it's not * already there. The "current" process is always on the * run-queue (except when the actual re-schedule is in * progress), and as such you're allowed to do the simpler * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. */ void wake_up_process(struct task_struct * p) { unsigned long flags; spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; if (!p->next_run) { add_to_runqueue(p); reschedule_idle(p); } spin_unlock_irqrestore(&runqueue_lock, flags); } static void process_timeout(unsigned long __data) { struct task_struct * p = (struct task_struct *) __data; wake_up_process(p); } /* * This is the function that decides how desirable a process is.. * You can weigh different processes against each other depending * on what CPU they've run on lately etc to try to handle cache * and TLB miss penalties. * * Return values: * -1000: never select this * 0: out of time, recalculate counters (but it might still be * selected) * +ve: "goodness" value (the larger, the better) * +1000: realtime process, select this. */ static inline int goodness(struct task_struct * p, struct task_struct * prev, int this_cpu) { int policy = p->policy; int weight; if (policy & SCHED_YIELD) { p->policy = policy & ~SCHED_YIELD; return 0; } /* * Realtime process, select the first one on the * runqueue (taking priorities within processes * into account). */ if (policy != SCHED_OTHER) return 1000 + p->rt_priority; /* * Give the process a first-approximation goodness value * according to the number of clock-ticks it has left. * * Don't do any other calculations if the time slice is * over.. */ weight = p->counter; if (weight) { #ifdef __SMP__ /* Give a largish advantage to the same processor... */ /* (this is equivalent to penalizing other processors) */ if (p->processor == this_cpu) weight += PROC_CHANGE_PENALTY; #endif /* .. and a slight advantage to the current thread */ if (p->mm == prev->mm) weight += 1; weight += p->priority; } return weight; } /* * Event timer code */ #define TVN_BITS 6 #define TVR_BITS 8 #define TVN_SIZE (1 << TVN_BITS) #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) struct timer_vec { int index; struct timer_list *vec[TVN_SIZE]; }; struct timer_vec_root { int index; struct timer_list *vec[TVR_SIZE]; }; static struct timer_vec tv5 = { 0 }; static struct timer_vec tv4 = { 0 }; static struct timer_vec tv3 = { 0 }; static struct timer_vec tv2 = { 0 }; static struct timer_vec_root tv1 = { 0 }; static struct timer_vec * const tvecs[] = { (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 }; #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) static unsigned long timer_jiffies = 0; static inline void insert_timer(struct timer_list *timer, struct timer_list **vec, int idx) { if ((timer->next = vec[idx])) vec[idx]->prev = timer; vec[idx] = timer; timer->prev = (struct timer_list *)&vec[idx]; } static inline void internal_add_timer(struct timer_list *timer) { /* * must be cli-ed when calling this */ unsigned long expires = timer->expires; unsigned long idx = expires - timer_jiffies; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; insert_timer(timer, tv1.vec, i); } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { int i = (expires >> TVR_BITS) & TVN_MASK; insert_timer(timer, tv2.vec, i); } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; insert_timer(timer, tv3.vec, i); } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; insert_timer(timer, tv4.vec, i); } else if ((signed long) idx < 0) { /* can happen if you add a timer with expires == jiffies, * or you set a timer to go off in the past */ insert_timer(timer, tv1.vec, tv1.index); } else if (idx <= 0xffffffffUL) { int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; insert_timer(timer, tv5.vec, i); } else { /* Can only get here on architectures with 64-bit jiffies */ timer->next = timer->prev = timer; } } spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; void add_timer(struct timer_list *timer) { unsigned long flags; spin_lock_irqsave(&timerlist_lock, flags); if (timer->prev) goto bug; internal_add_timer(timer); out: spin_unlock_irqrestore(&timerlist_lock, flags); return; bug: printk("bug: kernel timer added twice at %p.\n", __builtin_return_address(0)); goto out; } static inline int detach_timer(struct timer_list *timer) { struct timer_list *prev = timer->prev; if (prev) { struct timer_list *next = timer->next; prev->next = next; if (next) next->prev = prev; return 1; } return 0; } void mod_timer(struct timer_list *timer, unsigned long expires) { unsigned long flags; spin_lock_irqsave(&timerlist_lock, flags); timer->expires = expires; detach_timer(timer); internal_add_timer(timer); spin_unlock_irqrestore(&timerlist_lock, flags); } int del_timer(struct timer_list * timer) { int ret; unsigned long flags; spin_lock_irqsave(&timerlist_lock, flags); ret = detach_timer(timer); timer->next = timer->prev = 0; spin_unlock_irqrestore(&timerlist_lock, flags); return ret; } #ifdef __SMP__ #define idle_task (task[cpu_number_map[this_cpu]]) #define can_schedule(p) (!(p)->has_cpu) #else #define idle_task (&init_task) #define can_schedule(p) (1) #endif signed long schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " "value %lx from %p\n", timeout, __builtin_return_address(0)); goto out; } } expire = timeout + jiffies; init_timer(&timer); timer.expires = expire; timer.data = (unsigned long) current; timer.function = process_timeout; add_timer(&timer); schedule(); del_timer(&timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } /* * This one aligns per-CPU data on cacheline boundaries. */ static union { struct schedule_data { struct task_struct * prev; long prevstate; cycles_t last_schedule; } schedule_data; char __pad [SMP_CACHE_BYTES]; } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; static inline void __schedule_tail (void) { #ifdef __SMP__ struct schedule_data * sched_data; /* * We might have switched CPUs: */ sched_data = & aligned_data[smp_processor_id()].schedule_data; /* * Subtle. In the rare event that we got a wakeup to 'prev' just * during the reschedule (this is possible, the scheduler is pretty * parallel), we should do another reschedule in the next task's * context. schedule() will do the right thing next time around. * this is equivalent to 'delaying' the wakeup until the reschedule * has finished. */ if (sched_data->prev->state != sched_data->prevstate) current->need_resched = 1; /* * Release the previous process ... * * We have dropped all locks, and we must make sure that we * only mark the previous process as no longer having a CPU * after all other state has been seen by other CPU's. Thus * the write memory barrier! */ wmb(); sched_data->prev->has_cpu = 0; #endif /* __SMP__ */ } /* * schedule_tail() is getting called from the fork return path. This * cleans up all remaining scheduler things, without impacting the * common case. */ void schedule_tail (void) { __schedule_tail(); } /* * 'schedule()' is the scheduler function. It's a very simple and nice * scheduler: it's not perfect, but certainly works for most things. * * The goto is "interesting". * * NOTE!! Task 0 is the 'idle' task, which gets called when no other * tasks can run. It can not be killed, and it cannot sleep. The 'state' * information in task[0] is never used. */ asmlinkage void schedule(void) { struct schedule_data * sched_data; struct task_struct * prev, * next; int this_cpu; run_task_queue(&tq_scheduler); prev = current; this_cpu = prev->processor; /* * 'sched_data' is protected by the fact that we can run * only one process per CPU. */ sched_data = & aligned_data[this_cpu].schedule_data; if (in_interrupt()) goto scheduling_in_interrupt; release_kernel_lock(prev, this_cpu); /* Do "administrative" work here while we don't hold any locks */ if (bh_active & bh_mask) do_bottom_half(); spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); /* move an exhausted RR process to be last.. */ prev->need_resched = 0; if (!prev->counter && prev->policy == SCHED_RR) { prev->counter = prev->priority; move_last_runqueue(prev); } switch (prev->state) { case TASK_INTERRUPTIBLE: if (signal_pending(prev)) { prev->state = TASK_RUNNING; break; } default: del_from_runqueue(prev); case TASK_RUNNING: } sched_data->prevstate = prev->state; /* this is the scheduler proper: */ { struct task_struct * p = init_task.next_run; int c = -1000; /* Default process to select.. */ next = idle_task; if (prev->state == TASK_RUNNING) { c = goodness(prev, prev, this_cpu); next = prev; } /* * This is subtle. * Note how we can enable interrupts here, even * though interrupts can add processes to the run- * queue. This is because any new processes will * be added to the front of the queue, so "p" above * is a safe starting point. * run-queue deletion and re-ordering is protected by * the scheduler lock */ spin_unlock_irq(&runqueue_lock); /* * Note! there may appear new tasks on the run-queue during this, as * interrupts are enabled. However, they will be put on front of the * list, so our list starting at "p" is essentially fixed. */ while (p != &init_task) { if (can_schedule(p)) { int weight = goodness(p, prev, this_cpu); if (weight > c) c = weight, next = p; } p = p->next_run; } /* Do we need to re-calculate counters? */ if (!c) { struct task_struct *p; read_lock(&tasklist_lock); for_each_task(p) p->counter = (p->counter >> 1) + p->priority; read_unlock(&tasklist_lock); } } /* * maintain the per-process 'average timeslice' value. * (this has to be recalculated even if we reschedule to * the same process) Currently this is only used on SMP: */ #ifdef __SMP__ { cycles_t t, this_slice; t = get_cycles(); this_slice = t - sched_data->last_schedule; sched_data->last_schedule = t; /* * Simple, exponentially fading average calculation: */ prev->avg_slice = this_slice + prev->avg_slice; prev->avg_slice >>= 1; } /* * We drop the scheduler lock early (it's a global spinlock), * thus we have to lock the previous process from getting * rescheduled during switch_to(). */ next->processor = this_cpu; next->has_cpu = 1; spin_unlock(&scheduler_lock); #endif /* __SMP__ */ if (prev != next) { #ifdef __SMP__ sched_data->prev = prev; #endif kstat.context_swtch++; get_mmu_context(next); switch_to(prev,next); __schedule_tail(); } reacquire_kernel_lock(current); return; scheduling_in_interrupt: printk("Scheduling in interrupt\n"); *(int *)0 = 0; } rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED; /* * wake_up doesn't wake up stopped processes - they have to be awakened * with signals or similar. * * Note that we only need a read lock for the wait queue (and thus do not * have to protect against interrupts), as the actual removal from the * queue is handled by the process itself. */ void __wake_up(struct wait_queue **q, unsigned int mode) { struct wait_queue *next; read_lock(&waitqueue_lock); if (q && (next = *q)) { struct wait_queue *head; head = WAIT_QUEUE_HEAD(q); while (next != head) { struct task_struct *p = next->task; next = next->next; if (p->state & mode) wake_up_process(p); } } read_unlock(&waitqueue_lock); } /* * Semaphores are implemented using a two-way counter: * The "count" variable is decremented for each process * that tries to sleep, while the "waking" variable is * incremented when the "up()" code goes to wake up waiting * processes. * * Notably, the inline "up()" and "down()" functions can * efficiently test if they need to do any extra work (up * needs to do something only if count was negative before * the increment operation. * * waking_non_zero() (from asm/semaphore.h) must execute * atomically. * * When __up() is called, the count was negative before * incrementing it, and we need to wake up somebody. * * This routine adds one to the count of processes that need to * wake up and exit. ALL waiting processes actually wake up but * only the one that gets to the "waking" field first will gate * through and acquire the semaphore. The others will go back * to sleep. * * Note that these functions are only called when there is * contention on the lock, and as such all this is the * "non-critical" part of the whole semaphore business. The * critical part is the inline stuff in <asm/semaphore.h> * where we want to avoid any extra jumps and calls. */ void __up(struct semaphore *sem) { wake_one_more(sem); wake_up(&sem->wait); } /* * Perform the "down" function. Return zero for semaphore acquired, * return negative for signalled out of the function. * * If called from __down, the return is ignored and the wait loop is * not interruptible. This means that a task waiting on a semaphore * using "down()" cannot be killed until someone does an "up()" on * the semaphore. * * If called from __down_interruptible, the return value gets checked * upon return. If the return value is negative then the task continues * with the negative value in the return register (it can be tested by * the caller). * * Either form may be used in conjunction with "up()". * */ #define DOWN_VAR \ struct task_struct *tsk = current; \ struct wait_queue wait = { tsk, NULL }; #define DOWN_HEAD(task_state) \ \ \ tsk->state = (task_state); \ add_wait_queue(&sem->wait, &wait); \ \ /* \ * Ok, we're set up. sem->count is known to be less than zero \ * so we must wait. \ * \ * We can let go the lock for purposes of waiting. \ * We re-acquire it after awaking so as to protect \ * all semaphore operations. \ * \ * If "up()" is called before we call waking_non_zero() then \ * we will catch it right away. If it is called later then \ * we will have to go through a wakeup cycle to catch it. \ * \ * Multiple waiters contend for the semaphore lock to see \ * who gets to gate through and who has to wait some more. \ */ \ for (;;) { #define DOWN_TAIL(task_state) \ tsk->state = (task_state); \ } \ tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); void __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) if (waking_non_zero(sem)) break; schedule(); DOWN_TAIL(TASK_UNINTERRUPTIBLE) } int __down_interruptible(struct semaphore * sem) { DOWN_VAR int ret = 0; DOWN_HEAD(TASK_INTERRUPTIBLE) ret = waking_non_zero_interruptible(sem, tsk); if (ret) { if (ret == 1) /* ret != 0 only if we get interrupted -arca */ ret = 0; break; } schedule(); DOWN_TAIL(TASK_INTERRUPTIBLE) return ret; } int __down_trylock(struct semaphore * sem) { return waking_non_zero_trylock(sem); } #define SLEEP_ON_VAR \ unsigned long flags; \ struct wait_queue wait; #define SLEEP_ON_HEAD \ wait.task = current; \ write_lock_irqsave(&waitqueue_lock, flags); \ __add_wait_queue(p, &wait); \ write_unlock(&waitqueue_lock); #define SLEEP_ON_TAIL \ write_lock_irq(&waitqueue_lock); \ __remove_wait_queue(p, &wait); \ write_unlock_irqrestore(&waitqueue_lock, flags); void interruptible_sleep_on(struct wait_queue **p) { SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; SLEEP_ON_HEAD schedule(); SLEEP_ON_TAIL } long interruptible_sleep_on_timeout(struct wait_queue **p, long timeout) { SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; SLEEP_ON_HEAD timeout = schedule_timeout(timeout); SLEEP_ON_TAIL return timeout; } void sleep_on(struct wait_queue **p) { SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; SLEEP_ON_HEAD schedule(); SLEEP_ON_TAIL } long sleep_on_timeout(struct wait_queue **p, long timeout) { SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; SLEEP_ON_HEAD timeout = schedule_timeout(timeout); SLEEP_ON_TAIL return timeout; } void scheduling_functions_end_here(void) { } static inline void cascade_timers(struct timer_vec *tv) { /* cascade all the timers from tv up one level */ struct timer_list *timer; timer = tv->vec[tv->index]; /* * We are removing _all_ timers from the list, so we don't have to * detach them individually, just clear the list afterwards. */ while (timer) { struct timer_list *tmp = timer; timer = timer->next; internal_add_timer(tmp); } tv->vec[tv->index] = NULL; tv->index = (tv->index + 1) & TVN_MASK; } static inline void run_timer_list(void) { spin_lock_irq(&timerlist_lock); while ((long)(jiffies - timer_jiffies) >= 0) { struct timer_list *timer; if (!tv1.index) { int n = 1; do { cascade_timers(tvecs[n]); } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); } while ((timer = tv1.vec[tv1.index])) { void (*fn)(unsigned long) = timer->function; unsigned long data = timer->data; detach_timer(timer); timer->next = timer->prev = NULL; spin_unlock_irq(&timerlist_lock); fn(data); spin_lock_irq(&timerlist_lock); } ++timer_jiffies; tv1.index = (tv1.index + 1) & TVR_MASK; } spin_unlock_irq(&timerlist_lock); } static inline void run_old_timers(void) { struct timer_struct *tp; unsigned long mask; for (mask = 1, tp = timer_table+0 ; mask ; tp++,mask += mask) { if (mask > timer_active) break; if (!(mask & timer_active)) continue; if (time_after(tp->expires, jiffies)) continue; timer_active &= ~mask; tp->fn(); sti(); } } spinlock_t tqueue_lock; void tqueue_bh(void) { run_task_queue(&tq_timer); } void immediate_bh(void) { run_task_queue(&tq_immediate); } unsigned long timer_active = 0; struct timer_struct timer_table[32]; /* * Hmm.. Changed this, as the GNU make sources (load.c) seems to * imply that avenrun[] is the standard name for this kind of thing. * Nothing else seems to be standardized: the fractional size etc * all seem to differ on different machines. */ unsigned long avenrun[3] = { 0,0,0 }; /* * Nr of active tasks - counted in fixed-point numbers */ static unsigned long count_active_tasks(void) { struct task_struct *p; unsigned long nr = 0; read_lock(&tasklist_lock); for_each_task(p) { if ((p->state == TASK_RUNNING || p->state == TASK_UNINTERRUPTIBLE || p->state == TASK_SWAPPING)) nr += FIXED_1; } read_unlock(&tasklist_lock); return nr; } static inline void calc_load(unsigned long ticks) { unsigned long active_tasks; /* fixed-point */ static int count = LOAD_FREQ; count -= ticks; if (count < 0) { count += LOAD_FREQ; active_tasks = count_active_tasks(); CALC_LOAD(avenrun[0], EXP_1, active_tasks); CALC_LOAD(avenrun[1], EXP_5, active_tasks); CALC_LOAD(avenrun[2], EXP_15, active_tasks); } } /* * this routine handles the overflow of the microsecond field * * The tricky bits of code to handle the accurate clock support * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. * They were originally developed for SUN and DEC kernels. * All the kudos should go to Dave for this stuff. * */ static void second_overflow(void) { long ltemp; /* Bump the maxerror field */ time_maxerror += time_tolerance >> SHIFT_USEC; if ( time_maxerror > NTP_PHASE_LIMIT ) { time_maxerror = NTP_PHASE_LIMIT; time_status |= STA_UNSYNC; } /* * Leap second processing. If in leap-insert state at * the end of the day, the system clock is set back one * second; if in leap-delete state, the system clock is * set ahead one second. The microtime() routine or * external clock driver will insure that reported time * is always monotonic. The ugly divides should be * replaced. */ switch (time_state) { case TIME_OK: if (time_status & STA_INS) time_state = TIME_INS; else if (time_status & STA_DEL) time_state = TIME_DEL; break; case TIME_INS: if (xtime.tv_sec % 86400 == 0) { xtime.tv_sec--; time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: if ((xtime.tv_sec + 1) % 86400 == 0) { xtime.tv_sec++; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: time_state = TIME_WAIT; break; case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; } /* * Compute the phase adjustment for the next second. In * PLL mode, the offset is reduced by a fixed factor * times the time constant. In FLL mode the offset is * used directly. In either mode, the maximum phase * adjustment for each second is clamped so as to spread * the adjustment over not more than the number of * seconds between updates. */ if (time_offset < 0) { ltemp = -time_offset; if (!(time_status & STA_FLL)) ltemp >>= SHIFT_KG + time_constant; if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; time_offset += ltemp; time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); } else { ltemp = time_offset; if (!(time_status & STA_FLL)) ltemp >>= SHIFT_KG + time_constant; if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; time_offset -= ltemp; time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); } /* * Compute the frequency estimate and additional phase * adjustment due to frequency error for the next * second. When the PPS signal is engaged, gnaw on the * watchdog counter and update the frequency computed by * the pll and the PPS signal. */ pps_valid++; if (pps_valid == PPS_VALID) { /* PPS signal lost */ pps_jitter = MAXTIME; pps_stabil = MAXFREQ; time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); } ltemp = time_freq + pps_freq; if (ltemp < 0) time_adj -= -ltemp >> (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); else time_adj += ltemp >> (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); #if HZ == 100 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) */ if (time_adj < 0) time_adj -= (-time_adj >> 2) + (-time_adj >> 5); else time_adj += (time_adj >> 2) + (time_adj >> 5); #endif } /* in the NTP reference this is called "hardclock()" */ static void update_wall_time_one_tick(void) { if ( (time_adjust_step = time_adjust) != 0 ) { /* We are doing an adjtime thing. * * Prepare time_adjust_step to be within bounds. * Note that a positive time_adjust means we want the clock * to run faster. * * Limit the amount of the step to be in the range * -tickadj .. +tickadj */ if (time_adjust > tickadj) time_adjust_step = tickadj; else if (time_adjust < -tickadj) time_adjust_step = -tickadj; /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; } xtime.tv_usec += tick + time_adjust_step; /* * Advance the phase, once it gets to one microsecond, then * advance the tick more. */ time_phase += time_adj; if (time_phase <= -FINEUSEC) { long ltemp = -time_phase >> SHIFT_SCALE; time_phase += ltemp << SHIFT_SCALE; xtime.tv_usec -= ltemp; } else if (time_phase >= FINEUSEC) { long ltemp = time_phase >> SHIFT_SCALE; time_phase -= ltemp << SHIFT_SCALE; xtime.tv_usec += ltemp; } } /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, * we're doing this this way mainly for interrupt * latency reasons, not because we think we'll * have lots of lost timer ticks */ static void update_wall_time(unsigned long ticks) { do { ticks--; update_wall_time_one_tick(); } while (ticks); if (xtime.tv_usec >= 1000000) { xtime.tv_usec -= 1000000; xtime.tv_sec++; second_overflow(); } } static inline void do_process_times(struct task_struct *p, unsigned long user, unsigned long system) { long psecs; psecs = (p->times.tms_utime += user); psecs += (p->times.tms_stime += system); if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { /* Send SIGXCPU every second.. */ if (!(psecs % HZ)) send_sig(SIGXCPU, p, 1); /* and SIGKILL when we go over max.. */ if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) send_sig(SIGKILL, p, 1); } } static inline void do_it_virt(struct task_struct * p, unsigned long ticks) { unsigned long it_virt = p->it_virt_value; if (it_virt) { if (it_virt <= ticks) { it_virt = ticks + p->it_virt_incr; send_sig(SIGVTALRM, p, 1); } p->it_virt_value = it_virt - ticks; } } static inline void do_it_prof(struct task_struct * p, unsigned long ticks) { unsigned long it_prof = p->it_prof_value; if (it_prof) { if (it_prof <= ticks) { it_prof = ticks + p->it_prof_incr; send_sig(SIGPROF, p, 1); } p->it_prof_value = it_prof - ticks; } } void update_one_process(struct task_struct *p, unsigned long ticks, unsigned long user, unsigned long system, int cpu) { p->per_cpu_utime[cpu] += user; p->per_cpu_stime[cpu] += system; do_process_times(p, user, system); do_it_virt(p, user); do_it_prof(p, ticks); } static void update_process_times(unsigned long ticks, unsigned long system) { /* * SMP does this on a per-CPU basis elsewhere */ #ifndef __SMP__ struct task_struct * p = current; unsigned long user = ticks - system; if (p->pid) { p->counter -= ticks; if (p->counter < 0) { p->counter = 0; p->need_resched = 1; } if (p->priority < DEF_PRIORITY) kstat.cpu_nice += user; else kstat.cpu_user += user; kstat.cpu_system += system; } update_one_process(p, ticks, user, system, 0); #endif } volatile unsigned long lost_ticks = 0; static unsigned long lost_ticks_system = 0; /* * This spinlock protect us from races in SMP while playing with xtime. -arca */ rwlock_t xtime_lock = RW_LOCK_UNLOCKED; static inline void update_times(void) { unsigned long ticks; /* * update_times() is run from the raw timer_bh handler so we * just know that the irqs are locally enabled and so we don't * need to save/restore the flags of the local CPU here. -arca */ write_lock_irq(&xtime_lock); ticks = lost_ticks; lost_ticks = 0; if (ticks) { unsigned long system; system = xchg(&lost_ticks_system, 0); calc_load(ticks); update_wall_time(ticks); write_unlock_irq(&xtime_lock); update_process_times(ticks, system); } else write_unlock_irq(&xtime_lock); } static void timer_bh(void) { update_times(); run_old_timers(); run_timer_list(); } void do_timer(struct pt_regs * regs) { (*(unsigned long *)&jiffies)++; lost_ticks++; mark_bh(TIMER_BH); if (!user_mode(regs)) lost_ticks_system++; if (tq_timer) mark_bh(TQUEUE_BH); } #ifndef __alpha__ /* * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. */ asmlinkage unsigned int sys_alarm(unsigned int seconds) { struct itimerval it_new, it_old; unsigned int oldalarm; it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; it_new.it_value.tv_sec = seconds; it_new.it_value.tv_usec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); oldalarm = it_old.it_value.tv_sec; /* ehhh.. We can't return 0 if we have an alarm pending.. */ /* And we'd better return too much than too little anyway */ if (it_old.it_value.tv_usec) oldalarm++; return oldalarm; } /* * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this * should be moved into arch/i386 instead? */ asmlinkage int sys_getpid(void) { /* This is SMP safe - current->pid doesn't change */ return current->pid; } /* * This is not strictly SMP safe: p_opptr could change * from under us. However, rather than getting any lock * we can use an optimistic algorithm: get the parent * pid, and go back and check that the parent is still * the same. If it has changed (which is extremely unlikely * indeed), we just try again.. * * NOTE! This depends on the fact that even if we _do_ * get an old value of "parent", we can happily dereference * the pointer: we just can't necessarily trust the result * until we know that the parent pointer is valid. * * The "mb()" macro is a memory barrier - a synchronizing * event. It also makes sure that gcc doesn't optimize * away the necessary memory references.. The barrier doesn't * have to have all that strong semantics: on x86 we don't * really require a synchronizing instruction, for example. * The barrier is more important for code generation than * for any real memory ordering semantics (even if there is * a small window for a race, using the old pointer is * harmless for a while). */ asmlinkage int sys_getppid(void) { int pid; struct task_struct * me = current; struct task_struct * parent; parent = me->p_opptr; for (;;) { pid = parent->pid; #if __SMP__ { struct task_struct *old = parent; mb(); parent = me->p_opptr; if (old != parent) continue; } #endif break; } return pid; } asmlinkage int sys_getuid(void) { /* Only we change this so SMP safe */ return current->uid; } asmlinkage int sys_geteuid(void) { /* Only we change this so SMP safe */ return current->euid; } asmlinkage int sys_getgid(void) { /* Only we change this so SMP safe */ return current->gid; } asmlinkage int sys_getegid(void) { /* Only we change this so SMP safe */ return current->egid; } /* * This has been replaced by sys_setpriority. Maybe it should be * moved into the arch dependent tree for those ports that require * it for backward compatibility? */ asmlinkage int sys_nice(int increment) { unsigned long newprio; int increase = 0; /* * Setpriority might change our priority at the same moment. * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ newprio = increment; if (increment < 0) { if (!capable(CAP_SYS_NICE)) return -EPERM; newprio = -increment; increase = 1; } if (newprio > 40) newprio = 40; /* * do a "normalization" of the priority (traditionally * Unix nice values are -20 to 20; Linux doesn't really * use that kind of thing, but uses the length of the * timeslice instead (default 210 ms). The rounding is * why we want to avoid negative values. */ newprio = (newprio * DEF_PRIORITY + 10) / 20; increment = newprio; if (increase) increment = -increment; /* * Current->priority can change between this point * and the assignment. We are assigning not doing add/subs * so thats ok. Conceptually a process might just instantaneously * read the value we stomp over. I don't think that is an issue * unless posix makes it one. If so we can loop on changes * to current->priority. */ newprio = current->priority - increment; if ((signed) newprio < 1) newprio = 1; if (newprio > DEF_PRIORITY*2) newprio = DEF_PRIORITY*2; current->priority = newprio; return 0; } #endif static inline struct task_struct *find_process_by_pid(pid_t pid) { struct task_struct *tsk = current; if (pid) tsk = find_task_by_pid(pid); return tsk; } static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; struct task_struct *p; int retval; retval = -EINVAL; if (!param || pid < 0) goto out_nounlock; retval = -EFAULT; if (copy_from_user(&lp, param, sizeof(struct sched_param))) goto out_nounlock; /* * We play safe to avoid deadlocks. */ spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); read_lock(&tasklist_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) goto out_unlock; if (policy < 0) policy = p->policy; else { retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_OTHER) goto out_unlock; } /* * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid * priority for SCHED_OTHER is 0. */ retval = -EINVAL; if (lp.sched_priority < 0 || lp.sched_priority > 99) goto out_unlock; if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; if (p->next_run) move_first_runqueue(p); current->need_resched = 1; out_unlock: read_unlock(&tasklist_lock); spin_unlock_irq(&runqueue_lock); spin_unlock(&scheduler_lock); out_nounlock: return retval; } asmlinkage int sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) { return setscheduler(pid, policy, param); } asmlinkage int sys_sched_setparam(pid_t pid, struct sched_param *param) { return setscheduler(pid, -1, param); } asmlinkage int sys_sched_getscheduler(pid_t pid) { struct task_struct *p; int retval; retval = -EINVAL; if (pid < 0) goto out_nounlock; read_lock(&tasklist_lock); retval = -ESRCH; p = find_process_by_pid(pid); if (!p) goto out_unlock; retval = p->policy; out_unlock: read_unlock(&tasklist_lock); out_nounlock: return retval; } asmlinkage int sys_sched_getparam(pid_t pid, struct sched_param *param) { struct task_struct *p; struct sched_param lp; int retval; retval = -EINVAL; if (!param || pid < 0) goto out_nounlock; read_lock(&tasklist_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) goto out_unlock; lp.sched_priority = p->rt_priority; read_unlock(&tasklist_lock); /* * This one might sleep, we cannot do it with a spinlock held ... */ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; out_nounlock: return retval; out_unlock: read_unlock(&tasklist_lock); return retval; } asmlinkage int sys_sched_yield(void) { spin_lock(&scheduler_lock); spin_lock_irq(&runqueue_lock); if (current->policy == SCHED_OTHER) current->policy |= SCHED_YIELD; current->need_resched = 1; move_last_runqueue(current); spin_unlock_irq(&runqueue_lock); spin_unlock(&scheduler_lock); return 0; } asmlinkage int sys_sched_get_priority_max(int policy) { int ret = -EINVAL; switch (policy) { case SCHED_FIFO: case SCHED_RR: ret = 99; break; case SCHED_OTHER: ret = 0; break; } return ret; } asmlinkage int sys_sched_get_priority_min(int policy) { int ret = -EINVAL; switch (policy) { case SCHED_FIFO: case SCHED_RR: ret = 1; break; case SCHED_OTHER: ret = 0; } return ret; } asmlinkage int sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { struct timespec t; t.tv_sec = 0; t.tv_nsec = 150000; if (copy_to_user(interval, &t, sizeof(struct timespec))) return -EFAULT; return 0; } asmlinkage int sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) { struct timespec t; unsigned long expire; if(copy_from_user(&t, rqtp, sizeof(struct timespec))) return -EFAULT; if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) return -EINVAL; if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && current->policy != SCHED_OTHER) { /* * Short delay requests up to 2 ms will be handled with * high precision by a busy wait for all real-time processes. * * Its important on SMP not to do this holding locks. */ udelay((t.tv_nsec + 999) / 1000); return 0; } expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); current->state = TASK_INTERRUPTIBLE; expire = schedule_timeout(expire); if (expire) { if (rmtp) { jiffies_to_timespec(expire, &t); if (copy_to_user(rmtp, &t, sizeof(struct timespec))) return -EFAULT; } return -EINTR; } return 0; } static void show_task(int nr,struct task_struct * p) { unsigned long free = 0; int state; static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; printk("%-8s %3d ", p->comm, (p == current) ? -nr : nr); state = p->state ? ffz(~p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); else printk(" "); #if (BITS_PER_LONG == 32) if (p == current) printk(" current "); else printk(" %08lX ", thread_saved_pc(&p->tss)); #else if (p == current) printk(" current task "); else printk(" %016lx ", thread_saved_pc(&p->tss)); #endif { unsigned long * n = (unsigned long *) (p+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p+1); } printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid); if (p->p_cptr) printk("%5d ", p->p_cptr->pid); else printk(" "); if (p->p_ysptr) printk("%7d", p->p_ysptr->pid); else printk(" "); if (p->p_osptr) printk(" %5d\n", p->p_osptr->pid); else printk("\n"); { struct signal_queue *q; char s[sizeof(sigset_t)*2+1], b[sizeof(sigset_t)*2+1]; render_sigset_t(&p->signal, s); render_sigset_t(&p->blocked, b); printk(" sig: %d %s %s :", signal_pending(p), s, b); for (q = p->sigqueue; q ; q = q->next) printk(" %d", q->info.si_signo); printk(" X\n"); } } char * render_sigset_t(sigset_t *set, char *buffer) { int i = _NSIG, x; do { i -= 4, x = 0; if (sigismember(set, i+1)) x |= 1; if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; *buffer++ = (x < 10 ? '0' : 'a' - 10) + x; } while (i >= 4); *buffer = 0; return buffer; } void show_state(void) { struct task_struct *p; #if (BITS_PER_LONG == 32) printk("\n" " free sibling\n"); printk(" task PC stack pid father child younger older\n"); #else printk("\n" " free sibling\n"); printk(" task PC stack pid father child younger older\n"); #endif read_lock(&tasklist_lock); for_each_task(p) show_task((p->tarray_ptr - &task[0]),p); read_unlock(&tasklist_lock); } void __init sched_init(void) { /* * We have to do a little magic to get the first * process right in SMP mode. */ int cpu=hard_smp_processor_id(); int nr = NR_TASKS; init_task.processor=cpu; /* Init task array free list and pidhash table. */ while(--nr > 0) add_free_taskslot(&task[nr]); for(nr = 0; nr < PIDHASH_SZ; nr++) pidhash[nr] = NULL; init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); }